Importing Libraries¶
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
Imports from Scikit learn¶
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
Loading data¶
# Read the Telco customer-churn CSV from the working directory and preview the first rows.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
# (rows, columns) of the raw frame.
df.shape
(7043, 21)
The data set includes information about:¶
Customers who left within the last month – the column is called Churn
Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
Customer account information - how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
Demographic info about customers – gender, age range, and if they have partners and dependents
# Column names, non-null counts and dtypes of the raw frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
# Column names as a NumPy array.
df.columns.values
array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
'TotalCharges', 'Churn'], dtype=object)
# Per-column dtypes; TotalCharges is still an object column at this point.
df.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
Visualizing missing values¶
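From the visualisation we can observe no peculiar pattern that stands out. In fact, no values are recorded as missing at this stage, as observed below (the blank TotalCharges entries only surface as NaN after the numeric conversion performed later).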
# Visualize missing values as a matrix.
# The trailing semicolon keeps the notebook from echoing the returned object.
msno.matrix(df);
Manipulating and understanding data.¶
Dropping off Customer ID column
# Remove the customerID column and preview the remaining columns.
df = df.drop(columns=['customerID'])
df.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
Analyzing in depth. Checking for null values in particular columns.
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Count missing values per column after the conversion.
df.isna().sum()
gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
Checking for TotalCharges Column.
# Rows whose TotalCharges is NaN after the numeric conversion.
df.loc[df['TotalCharges'].isna()]
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | No | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | NaN | No |
| 753 | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | NaN | No |
| 936 | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | Yes | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | NaN | No |
| 1082 | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | NaN | No |
| 1340 | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | NaN | No |
| 3331 | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | NaN | No |
| 3826 | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | NaN | No |
| 4380 | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | NaN | No |
| 5218 | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | NaN | No |
| 6670 | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | Yes | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | NaN | No |
| 6754 | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | Yes | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | NaN | No |
It can also be noted that the Tenure column is 0 for these entries even though the MonthlyCharges column is not empty.
Let's see if there are any other 0 values in the tenure column.
# Index labels of the rows whose tenure is 0.
df.index[df['tenure'] == 0]
Index([488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754], dtype='int64')
Deleting the rows where tenure is 0 (the same 11 rows whose TotalCharges is missing), since there are only 11 of them and removing them will not materially affect the data.¶
# Drop, in place, every row whose tenure is 0, then confirm none remain.
zero_tenure_rows = df.index[df['tenure'] == 0]
df.drop(index=zero_tenure_rows, inplace=True)
df.index[df['tenure'] == 0]
Index([], dtype='int64')
Filling any remaining missing values in the TotalCharges column with the column mean.
# fillna() returns a new object, so the result must be assigned back;
# the original call discarded it and left df unchanged.
# Only TotalCharges is filled, because the mean used is that column's mean.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
df
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.50 | No |
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.50 | No |
| 7039 | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.90 | No |
| 7040 | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.60 | Yes |
| 7042 | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.50 | No |
7032 rows × 20 columns
# Re-check the per-column missing-value counts after cleaning.
df.isnull().sum()
gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
Labels for Senior Citizen column
# Replace the 0/1 SeniorCitizen flag with "No"/"Yes" labels for plotting.
senior_labels = {0: "No", 1: "Yes"}
df["SeniorCitizen"] = df["SeniorCitizen"].map(senior_labels)
df.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | No | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.50 | No |
| 2 | Male | No | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | No | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | No | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
Checking Categorical and numeric data.
# Summary (count / unique / top / freq) of one categorical column.
# `include=` is dropped: Series.describe ignores it, so it only misled readers.
df["InternetService"].describe()
count 7032 unique 3 top Fiber optic freq 3096 Name: InternetService, dtype: object
# Continuous features; this list is reused further down when plotting the scaled distributions.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Summary statistics for the continuous features.
df[numerical_cols].describe()
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| count | 7032.000000 | 7032.000000 | 7032.000000 |
| mean | 32.421786 | 64.798208 | 2283.300441 |
| std | 24.545260 | 30.085974 | 2266.771362 |
| min | 1.000000 | 18.250000 | 18.800000 |
| 25% | 9.000000 | 35.587500 | 401.450000 |
| 50% | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 55.000000 | 89.862500 | 3794.737500 |
| max | 72.000000 | 118.750000 | 8684.800000 |
Visualizing the data¶
- 26.6 % of customers switched to another firm.
- Customers are 49.5 % female and 50.5 % male.
# Take the labels from the same value_counts() result that supplies the values,
# so each slice is always labelled with the category it actually represents.
# (value_counts() orders by frequency, so hard-coded labels can silently mismatch.)
gender_counts = df['gender'].value_counts()
churn_counts = df['Churn'].value_counts()
g_labels = gender_counts.index.tolist()
c_labels = churn_counts.index.tolist()
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(
    go.Pie(labels=g_labels, values=gender_counts.values, name="Gender"),
    1, 1)
fig.add_trace(
    go.Pie(labels=c_labels, values=churn_counts.values, name="Churn"),
    1, 2)
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
fig.update_layout(
    title_text="Gender and Churn Distributions",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Gender', x=0.19, y=0.5, font_size=20, showarrow=False),
                 dict(text='Churn', x=0.81, y=0.5, font_size=20, showarrow=False)])
fig.show()
No of users which churn or not by gender
# Customers who did not churn, counted per gender.
df.loc[df["Churn"] == "No"].groupby("gender")["Churn"].count()
gender Female 2544 Male 2619 Name: Churn, dtype: int64
# Customers who churned, counted per gender.
df.loc[df["Churn"] == "Yes"].groupby("gender")["Churn"].count()
gender Female 939 Male 930 Name: Churn, dtype: int64
Churn distribution as per gender¶
There is a negligible difference in the percentage/count of customers who changed service provider. Both genders behaved in a similar fashion when it comes to migrating to another service provider/firm.
# Data: derive the counts from the frame instead of hard-coding numbers copied
# from an earlier cell's output, so the chart stays correct if the data changes.
inner_keys = [('Yes', 'Female'), ('Yes', 'Male'), ('No', 'Female'), ('No', 'Male')]
churn_gender_counts = (
    df.groupby(['Churn', 'gender'])
    .size()
    # Fix the order to match labels_inner; absent combinations count as 0.
    .reindex(pd.MultiIndex.from_tuples(inner_keys, names=['Churn', 'gender']),
             fill_value=0)
)

labels_outer = ['Churn: Yes', 'Churn: No']
labels_inner = ['Female (Yes)', 'Male (Yes)', 'Female (No)', 'Male (No)']
values_inner = [int(count) for count in churn_gender_counts]
# Churn totals are the sums of the two gender counts within each churn outcome.
values_outer = [sum(values_inner[:2]), sum(values_inner[2:])]
colors_outer = ['#FF6F61', '#6BAED6']
colors_inner = ['#E0BBE4', '#957DAD', '#AED9E0', '#7FC8A9']

fig = go.Figure()

# Churn Yes/No trace (hole=0.5); both traces share the same domain.
fig.add_trace(go.Pie(
    labels=labels_outer,
    values=values_outer,
    hole=0.5,
    direction='clockwise',
    sort=False,
    marker=dict(colors=colors_outer, line=dict(color='#FFFFFF', width=2)),
    textinfo='label+percent',
    textfont_size=16,
    name='Churn',
    domain={'x': [0, 1], 'y': [0, 1]},
    hoverinfo='label+value+percent',
    showlegend=True
))

# Gender-within-churn trace (hole=0.75), added second.
fig.add_trace(go.Pie(
    labels=labels_inner,
    values=values_inner,
    hole=0.75,
    direction='clockwise',
    sort=False,
    marker=dict(colors=colors_inner, line=dict(color='#FFFFFF', width=2)),
    textinfo='label',
    textfont_size=14,
    name='Gender',
    domain={'x': [0, 1], 'y': [0, 1]},
    hoverinfo='label+value',
    showlegend=False
))

# Layout (the stray ")" in the original title is removed).
fig.update_layout(
    title_text="Churn Distribution by Gender",
    annotations=[dict(text='Churn', x=0.5, y=0.5, font_size=20, showarrow=False)],
    height=600,
    width=600
)
fig.show()
Contract distribution of customers¶
About 75% of customers with a Month-to-Month contract opted to move out, as compared to 13% of customers with a One Year contract and 3% with a Two Year contract.
# Grouped bar chart: customer counts per churn outcome, one bar per contract type.
colors = ['#1f77b4', '#2ca02c', '#d62728']
fig = px.histogram(df, x="Churn", color="Contract", barmode="group",
                   title="<b>Customer Contract Distribution</b>",
                   color_discrete_sequence=colors)
fig.update_layout(width=700, height=500, bargap=0.1,
                  font={'color': 'black'},
                  title_font={'size': 20, 'color': 'black'})
fig.show()
Payment method distribution¶
# Count customers per payment method once and reuse the result for labels and values.
payment_counts = df['PaymentMethod'].value_counts()
labels = payment_counts.index
values = payment_counts.values

# Soft colour palette, one entry per slice.
colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3']

# Draw the pie with percentage labels and white slice borders.
plt.figure(figsize=(6.5, 6.5))
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors,
        wedgeprops={'edgecolor': 'white', 'linewidth': 2},
        textprops={'fontsize': 13})

plt.title("Payment Method Distribution", fontsize=16, weight='bold')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.show()
Payment method distribution w.r.t. churn¶
- Most of the customers who moved out were using Electronic Check as their payment method.
- Customers who opted for Credit-Card automatic transfer or Bank Automatic Transfer and Mailed Check as Payment Method were less likely to move out.
# Stacked bar chart: customer counts per churn outcome, split by payment method.
dark_colors = ["#1f26b4", "#690e0e", "#093e09", "#ee7777"]
fig = px.histogram(df, x="Churn", color="PaymentMethod",
                   title="<b>Customer Payment Method Distribution w.r.t. Churn</b>",
                   color_discrete_sequence=dark_colors, barmode="stack")
fig.update_layout(width=700, height=500, bargap=0.1,
                  plot_bgcolor='white', paper_bgcolor='white',
                  font={'color': 'black'},
                  title_font={'size': 18, 'color': 'black'},
                  legend_title_text='Payment Method')
fig.show()
Churn distribution with internet services and gender¶
A lot of customers choose the Fiber optic service and it's also evident that the customers who use Fiber optic have high churn rate, this might suggest a dissatisfaction with this type of internet service.
Customers with DSL service are fewer in number than Fiber optic customers and have a lower churn rate compared to Fiber optic service.
# Distinct values of the InternetService column.
df["InternetService"].unique()
array(['DSL', 'Fiber optic', 'No'], dtype=object)
# Counts of (InternetService, Churn) combinations among male customers.
df.loc[df["gender"] == "Male", ["InternetService", "Churn"]].value_counts()
InternetService Churn DSL No 992 Fiber optic No 910 No No 717 Fiber optic Yes 633 DSL Yes 240 No Yes 57 Name: count, dtype: int64
# Counts of (InternetService, Churn) combinations among female customers.
df.loc[df["gender"] == "Female", ["InternetService", "Churn"]].value_counts()
InternetService Churn DSL No 965 Fiber optic No 889 No No 690 Fiber optic Yes 664 DSL Yes 219 No Yes 56 Name: count, dtype: int64
# Bar heights are computed from the frame instead of being copied by hand from
# the value_counts() outputs, so they cannot drift out of sync with the data.
x_groups = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
            ["Female", "Male", "Female", "Male"]]
# (churn, gender) pairs in the same order as the x-axis groups above.
bar_order = [('No', 'Female'), ('No', 'Male'), ('Yes', 'Female'), ('Yes', 'Male')]
service_counts = df.groupby(['InternetService', 'Churn', 'gender']).size()

fig = go.Figure()
# (value in the InternetService column, legend name shown on the chart)
for service, trace_name in [('DSL', 'DSL'),
                            ('Fiber optic', 'Fiber optic'),
                            ('No', 'No Internet')]:
    heights = [int(service_counts.get((service, churn, gender), 0))
               for churn, gender in bar_order]
    fig.add_trace(go.Bar(x=x_groups, y=heights, name=trace_name))
fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")
fig.show()
Distribution w.r.t dependents
# Grouped bar chart of churn outcome, split by whether the customer has dependents.
color_map = {"Yes": "#DEADDE", "No": "#2D1546"}
fig = px.histogram(
    df,
    x="Churn",
    color="Dependents",
    barmode="group",
    title="<b>Dependents distribution</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn distribution w.r.t. partners
# Grouped bar chart of churn outcome, split by whether the customer has a partner.
# The chart title previously read "Chrun"; corrected to "Churn".
color_map = {"Yes": "#672B2B", "No": "#2D2A63"}
fig = px.histogram(
    df,
    x="Churn",
    color="Partner",
    barmode="group",
    title="<b>Churn distribution w.r.t. Partners</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn map w.r.t. senior citizen
- It can be observed that the fraction of senior citizens is very small.
- Most of the senior citizens churn.
# Bar chart of churn outcome, coloured by the SeniorCitizen label.
# The chart title previously read "Chrun"; corrected to "Churn".
color_map = {"Yes": "#1777A7", "No": "#A54367"}
fig = px.histogram(
    df,
    x="Churn",
    color="SeniorCitizen",
    title="<b>Churn distribution w.r.t. Senior Citizen</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn w.r.t. online security
- It has been observed that the customers churn in the absence of online security.
# Grouped bar chart of churn outcome, split by the OnlineSecurity value.
# Only "Yes" and "No" have explicit colours in the map below.
color_map = {"Yes": "#22A431", "No": "#DF5757"}
fig = px.histogram(
    df,
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    title="<b>Churn w.r.t Online Security</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn distribution w.r.t paperless billing
Customers with paperless billing are more likely to churn (PaperlessBilling is positively correlated with Churn in the correlation table further below).
# Bar chart of churn outcome, coloured by the PaperlessBilling value.
# The chart title previously read "Chrun"; corrected to "Churn".
color_map = {"Yes": '#1777A7', "No": '#A54367'}
fig = px.histogram(
    df,
    x="Churn",
    color="PaperlessBilling",
    title="<b>Churn distribution w.r.t. Paperless Billing</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn distribution with respect to Tech Support
Customers with no TechSupport are the most likely to migrate to another service provider.
# Grouped bar chart of churn outcome, split by the TechSupport value.
# The chart title previously read "Chrun"; corrected to "Churn".
fig = px.histogram(
    df,
    x="Churn",
    color="TechSupport",
    barmode="group",
    title="<b>Churn distribution w.r.t. TechSupport</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Churn distribution w.r.t. phone services
Only a very small fraction of customers have no phone service; of those, roughly one third are likely to churn.
# Bar chart of churn outcome, coloured by the PhoneService value.
# The chart title previously read "Chrun"; corrected to "Churn".
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(
    df,
    x="Churn",
    color="PhoneService",
    title="<b>Churn distribution w.r.t. Phone Service</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Distribution of monthly charges by churn
Customers with higher monthly charges are more likely to churn
# Overlaid density estimates of MonthlyCharges for retained vs churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.MonthlyCharges[df["Churn"] == 'No'],
                 color="Red", fill=True)
ax = sns.kdeplot(df.MonthlyCharges[df["Churn"] == 'Yes'],
                 ax=ax, color="Blue", fill=True)
# Legend entries follow the order in which the two curves were drawn.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn');
Distribution of total charges by churn
# Overlaid density estimates of TotalCharges for retained vs churned customers.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot,
# and the legend label typo "Not Chu0rn" is corrected.
ax = sns.kdeplot(df.TotalCharges[df["Churn"] == 'No'],
                 color="Red", fill=True)
ax = sns.kdeplot(df.TotalCharges[df["Churn"] == 'Yes'],
                 ax=ax, color="Blue", fill=True)
# Legend entries follow the order in which the two curves were drawn.
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Total Charges')
ax.set_title('Distribution of total charges by churn');
Tenure vs Churn
New customers are more likely to churn
# Box plot of tenure for each churn outcome.
fig = px.box(df, x='Churn', y='tenure')

# Axis titles for the single subplot.
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
fig.update_xaxes(title_text='Churn', row=1, col=1)

# Figure size and title; the keyword order of the original call is kept.
fig.update_layout(
    autosize=True,
    width=750,
    height=600,
    title_font={'size': 25, 'family': 'Courier'},
    title='<b>Tenure vs Churn</b>',
)
fig.show()
# Correlation heatmap over integer-coded columns.
# NOTE(review): pd.factorize is applied to every column, including the numeric
# ones, so their correlations are computed on category codes, not raw values.
plt.figure(figsize=(25, 10))
factorized = df.apply(lambda column: pd.factorize(column)[0])
corr = factorized.corr()
# Boolean mask that hides the upper triangle (and diagonal) of the matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
Data Preprocessing and Applying ML models¶
def object_to_int(dataframe_series):
    """Label-encode a text column; return any other column unchanged.

    Args:
        dataframe_series: A pandas Series (one DataFrame column).

    Returns:
        The array produced by ``LabelEncoder().fit_transform`` when the column
        has ``object`` dtype or a pandas string dtype; otherwise the input
        Series itself, untouched.
    """
    # The original test was `dtype == 'object'` only; pandas string dtypes are
    # text columns too and would otherwise reach the models unencoded.
    is_text = (
        dataframe_series.dtype == 'object'
        or pd.api.types.is_string_dtype(dataframe_series)
    )
    if is_text:
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series
# Encode every column with object_to_int, then preview the encoded frame.
df = df.apply(object_to_int)
df.head()
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 29.85 | 29.85 | 0 |
| 1 | 1 | 0 | 0 | 0 | 34 | 1 | 0 | 0 | 2 | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 56.95 | 1889.50 | 0 |
| 2 | 1 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 53.85 | 108.15 | 1 |
| 3 | 1 | 0 | 0 | 0 | 45 | 0 | 1 | 0 | 2 | 0 | 2 | 2 | 0 | 0 | 1 | 0 | 0 | 42.30 | 1840.75 | 0 |
| 4 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 70.70 | 151.65 | 1 |
# Correlation of every encoded feature with Churn, strongest positive first.
# The original created a figure and never drew on it, which left an empty
# "Figure ... with 0 Axes" in the output; the values are now plotted as bars.
churn_corr = df.corr()['Churn'].sort_values(ascending=False)
plt.figure(figsize=(14, 7))
churn_corr.plot(kind='bar')
plt.title('Correlation of features with Churn')
plt.tight_layout()
plt.show()
churn_corr
Churn 1.000000 MonthlyCharges 0.192858 PaperlessBilling 0.191454 SeniorCitizen 0.150541 PaymentMethod 0.107852 MultipleLines 0.038043 PhoneService 0.011691 gender -0.008545 StreamingTV -0.036303 StreamingMovies -0.038802 InternetService -0.047097 Partner -0.149982 Dependents -0.163128 DeviceProtection -0.177883 OnlineBackup -0.195290 TotalCharges -0.199484 TechSupport -0.282232 OnlineSecurity -0.289050 tenure -0.354049 Contract -0.396150 Name: Churn, dtype: float64
<Figure size 1400x700 with 0 Axes>
# Separate the features from the Churn target.
X = df.drop('Churn', axis=1)
y = df['Churn'].to_numpy()
# Hold out 30% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=40,
    stratify=y,
)
def distplot(feature, frame, color='r'):
    """Plot the distribution of one column as a density histogram with a KDE curve.

    Args:
        feature: Name of the column in ``frame`` to plot.
        frame: DataFrame that holds the column.
        color: Matplotlib colour used for the bars and the curve.
    """
    plt.figure(figsize=(8, 3))
    plt.title("Distribution for {}".format(feature))
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the replacement given in seaborn's migration notes.
    sns.histplot(frame[feature], kde=True, stat="density",
                 kde_kws=dict(cut=3), color=color)
Since the numerical features are distributed over different value ranges, I will use StandardScaler to bring them to a common scale.
# Continuous features to plot and, later, to standardise.
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
# One distribution plot per continuous feature, on the unscaled data.
for feat in num_cols:
    distplot(feat, df)
Standardizing numeric attributes
# Standardise the continuous features (for visualisation only) and re-plot them.
# The loop now iterates over num_cols, the same list used to build df_std,
# instead of the separately defined numerical_cols.
scaled_values = StandardScaler().fit_transform(df[num_cols].astype('float64'))
df_std = pd.DataFrame(scaled_values, columns=num_cols)
for feat in num_cols:
    distplot(feat, df_std, color='g')
Divide the columns into 3 categories: one for standardisation, one for label encoding and one for one-hot encoding¶
cat_cols_ohe = ['PaymentMethod', 'Contract', 'InternetService']  # those that need one-hot encoding
# Remaining columns are the ones that need label encoding. They are listed in
# DataFrame column order; the original list(set(...)) gave an arbitrary order.
excluded_cols = set(num_cols) | set(cat_cols_ohe)
cat_cols_le = [col for col in X_train.columns if col not in excluded_cols]
# NOTE(review): cat_cols_ohe and cat_cols_le are not referenced again in the
# visible part of the notebook, so no one-hot encoding is actually applied.

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
Machine learning model evaluations and predictions¶
KNN¶
# k-nearest-neighbours classifier with k=11, scored on the held-out split.
knn_model = KNeighborsClassifier(n_neighbors=11)
knn_model.fit(X_train, y_train)
predicted_y = knn_model.predict(X_test)
# Same value as knn_model.score(X_test, y_test): accuracy of these predictions.
accuracy_knn = accuracy_score(y_test, predicted_y)
print("KNN accuracy:", accuracy_knn)
KNN accuracy: 0.776303317535545
# Per-class precision, recall and F1 for the KNN predictions.
print(classification_report(y_test, predicted_y))
precision recall f1-score support
0 0.83 0.87 0.85 1549
1 0.59 0.52 0.55 561
accuracy 0.78 2110
macro avg 0.71 0.69 0.70 2110
weighted avg 0.77 0.78 0.77 2110
SVC¶
# Support-vector classifier with default settings and a fixed seed.
svc_model = SVC(random_state=1)
svc_model.fit(X_train, y_train)
predict_y = svc_model.predict(X_test)
# Same value as svc_model.score(X_test, y_test): accuracy of these predictions.
accuracy_svc = accuracy_score(y_test, predict_y)
print("SVM accuracy is :", accuracy_svc)
SVM accuracy is : 0.8075829383886256
# Per-class precision, recall and F1 for the SVC predictions.
print(classification_report(y_test, predict_y))
precision recall f1-score support
0 0.84 0.92 0.88 1549
1 0.69 0.50 0.58 561
accuracy 0.81 2110
macro avg 0.76 0.71 0.73 2110
weighted avg 0.80 0.81 0.80 2110
Random Forest¶
# Random forest: 500 trees, at most 30 leaves each, sqrt(n_features) per split.
model_rf = RandomForestClassifier(
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=50,
    max_features="sqrt",
    max_leaf_nodes=30,
)
model_rf.fit(X_train, y_train)

# Predict on the held-out split and report accuracy.
prediction_test = model_rf.predict(X_test)
print(accuracy_score(y_test, prediction_test))
0.8137440758293839
# Per-class precision, recall and F1 for the random-forest predictions.
print(classification_report(y_test, prediction_test))
precision recall f1-score support
0 0.84 0.92 0.88 1549
1 0.71 0.51 0.59 561
accuracy 0.81 2110
macro avg 0.77 0.72 0.74 2110
weighted avg 0.80 0.81 0.80 2110
# Annotated confusion matrix (true labels in rows) for the random forest.
rf_confusion = confusion_matrix(y_test, prediction_test)
plt.figure(figsize=(8, 6))
sns.heatmap(rf_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title(" RANDOM FOREST CONFUSION MATRIX", fontsize=14)
plt.show()
# ROC curve for the random forest, from the predicted probability of class 1.
y_rfpred_prob = model_rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds = roc_curve(y_test, y_rfpred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.plot(fpr_rf, tpr_rf, label='Random Forest', color="r")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve', fontsize=16)
# The curve was given a label but no legend was drawn, so it never appeared.
plt.legend(loc='lower right')
plt.show();
Logistic Regression¶
# Logistic regression with default settings, scored on the held-out split.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
# Same value as lr_model.score(X_test, y_test).
accuracy_lr = accuracy_score(y_test, lr_model.predict(X_test))
print("Logistic Regression accuracy is :", accuracy_lr)
Logistic Regression accuracy is : 0.8090047393364929
# Test-set predictions; reused by the confusion-matrix cell that follows.
lr_pred= lr_model.predict(X_test)
# Per-class precision, recall and F1 for the logistic-regression predictions.
report = classification_report(y_test,lr_pred)
print(report)
precision recall f1-score support
0 0.85 0.89 0.87 1549
1 0.66 0.58 0.62 561
accuracy 0.81 2110
macro avg 0.76 0.74 0.75 2110
weighted avg 0.80 0.81 0.80 2110
# Annotated confusion matrix (true labels in rows) for logistic regression.
lr_confusion = confusion_matrix(y_test, lr_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(lr_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("LOGISTIC REGRESSION CONFUSION MATRIX", fontsize=14)
plt.show()
# ROC curve for logistic regression, from the predicted probability of class 1.
y_pred_prob = lr_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.plot(fpr, tpr, label='Logistic Regression', color="r")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve', fontsize=16)
# The curve was given a label but no legend was drawn, so it never appeared.
plt.legend(loc='lower right')
plt.show();
Decision tree classifier¶
# Decision tree with default hyper-parameters.
# A fixed random_state makes the reported accuracy reproducible across runs,
# in line with the seeded SVC and random-forest cells.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
predictdt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test, y_test)
print("Decision Tree accuracy is :", accuracy_dt)
Decision Tree accuracy is : 0.7246445497630332
# Per-class precision, recall and F1 for the decision-tree predictions.
print(classification_report(y_test, predictdt_y))
precision recall f1-score support
0 0.82 0.80 0.81 1549
1 0.48 0.52 0.50 561
accuracy 0.72 2110
macro avg 0.65 0.66 0.65 2110
weighted avg 0.73 0.72 0.73 2110
Ada Boost Classifier¶
# AdaBoost with default settings, scored on the held-out split.
a_model = AdaBoostClassifier()
a_model.fit(X_train, y_train)
a_preds = a_model.predict(X_test)
print("AdaBoost Classifier accuracy")
# Left as a bare expression so the notebook echoes the accuracy value.
accuracy_score(y_test, a_preds)
AdaBoost Classifier accuracy
0.8127962085308057
# Per-class precision, recall and F1 for the AdaBoost predictions.
print(classification_report(y_test, a_preds))
precision recall f1-score support
0 0.85 0.90 0.88 1549
1 0.68 0.56 0.62 561
accuracy 0.81 2110
macro avg 0.76 0.73 0.75 2110
weighted avg 0.81 0.81 0.81 2110
# Annotated confusion matrix (true labels in rows) for AdaBoost.
ada_confusion = confusion_matrix(y_test, a_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(ada_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("AdaBoost Classifier Confusion Matrix", fontsize=14)
plt.show()
Gradient Boosting Classifier¶
# Gradient boosting with default settings, scored on the held-out split.
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print("Gradient Boosting Classifier", gb_accuracy)
Gradient Boosting Classifier 0.8075829383886256
# Per-class precision, recall and F1 for the gradient-boosting predictions.
print(classification_report(y_test, gb_pred))
precision recall f1-score support
0 0.85 0.90 0.87 1549
1 0.67 0.55 0.60 561
accuracy 0.81 2110
macro avg 0.76 0.73 0.74 2110
weighted avg 0.80 0.81 0.80 2110
# Annotated confusion matrix (true labels in rows) for gradient boosting.
gb_confusion = confusion_matrix(y_test, gb_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(gb_confusion, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Gradient Boosting Classifier Confusion Matrix", fontsize=14)
plt.show()
Voting Classifier¶
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of three default-configured models.
clf1 = GradientBoostingClassifier()
clf2 = LogisticRegression()
clf3 = AdaBoostClassifier()
ensemble_members = [('gbc', clf1), ('lr', clf2), ('abc', clf3)]
eclf1 = VotingClassifier(estimators=ensemble_members, voting='soft')
eclf1.fit(X_train, y_train)

# Evaluate the ensemble on the held-out split.
predictions = eclf1.predict(X_test)
print("Final Accuracy Score ")
print(accuracy_score(y_test, predictions))
Final Accuracy Score 0.8161137440758294
# Per-class precision, recall and F1 for the voting-ensemble predictions.
print(classification_report(y_test, predictions))
precision recall f1-score support
0 0.85 0.90 0.88 1549
1 0.68 0.57 0.62 561
accuracy 0.82 2110
macro avg 0.77 0.74 0.75 2110
weighted avg 0.81 0.82 0.81 2110
From the confusion matrix we can see that: There are total 1400+149=1549 actual non-churn values and the algorithm predicts 1400 of them as non churn and 149 of them as churn. While there are 237+324=561 actual churn values and the algorithm predicts 237 of them as non churn values and 324 of them as churn values.
# Confusion matrix of the voting ensemble (true labels in rows).
cm = confusion_matrix(y_test, predictions)
labels = ['No Churn', 'Churn']

# Pink-blue palette for the heatmap cells.
pink_blue = sns.color_palette(["#f3d7d9", "#6eaae2", "#669ab7", "#739ebd"], as_cmap=True)

plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap=pink_blue,
            xticklabels=labels, yticklabels=labels,
            linewidths=2, linecolor='white', cbar=False,
            annot_kws={"size": 14, "weight": "bold", "color": "black"})

# Titles, axis labels and tick sizes.
plt.title("Final Confusion Matrix", fontsize=16, weight='bold', color='#4B0082')
plt.xlabel("Predicted Label", fontsize=13)
plt.ylabel("True Label", fontsize=13)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()